import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import cluster, mixture # For clustering
import types
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import seaborn as sns
init_notebook_mode(connected=True)
from IPython.display import display
%matplotlib inline
# # importing the required module
# import timeit
# # code snippet to be executed only once
# mysetup = "from math import sqrt"
# # code snippet whose execution time is to be measured
# mycode = '''
# mylist=[]
# for x in range(100):
# mylist.append(sqrt(x))
# '''
# # timeit statement
# print(timeit.timeit(setup = mysetup,
# stmt = mycode,
# number = 1000000))
#data load
# Load the 2018 US customer file and work on a reproducible 10% sample.
df = pd.read_csv("us_2018.csv")
df = df.sample(frac=0.10, random_state=1)  # taking 10% sample for now

id_col = ['customer_id']                   # identifier column, excluded from analysis
num_cols = [c for c in df.columns if c not in id_col]
df.drop(id_col, inplace=True, axis=1)      # keep only the numeric usage metrics
df_bu = df.copy()                          # backup of the raw (un-normalized) values
df.head(2)
# #for all clustering
# df2 = df.copy()
# cols_to_norm = ['free_units','paid_units','num_streaming_session_2018']
# df2[cols_to_norm] = MinMaxScaler().fit_transform(df2[cols_to_norm])
# x1 = np.array(df2.sample(frac=0.10, random_state=1)["paid_units"])
# x2 = np.array(df2.sample(frac=0.10, random_state=1)["num_streaming_session_2018"])
# X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
df.info()
#plot histograms
#function for histogram for customer attrition types
def histogram(column):
    """Plot a percent-normalized histogram of df[column] via plotly."""
    hist_trace = go.Histogram(x=df[column],
                              histnorm="percent",
                              name="US Customers 2018",
                              marker=dict(line=dict(width=.5, color="black")),
                              opacity=.9)
    # Both axes share the same grid styling; only the titles differ.
    axis_style = dict(gridcolor='rgb(80, 80, 80)',
                      zerolinewidth=1,
                      ticklen=5,
                      gridwidth=2)
    layout = go.Layout(dict(title=column + " distribution of US Customers",
                            plot_bgcolor="rgb(255,255,255)",
                            paper_bgcolor="rgb(255,255,255)",
                            xaxis=dict(axis_style, title=column),
                            yaxis=dict(axis_style, title="percent")))
    fig = go.Figure(data=[hist_trace], layout=layout)
    iplot(fig)
#for all numeric columns plot histogram
for col in num_cols:
    histogram(col)
# Min-max scale the usage metrics to [0, 1] before clustering.
cols_to_norm = ['free_units','paid_units','num_streaming_session_2018']
df[cols_to_norm] = MinMaxScaler().fit_transform(df[cols_to_norm])
#df[cols_to_norm] = StandardScaler().fit_transform(df[cols_to_norm])
#plot histograms again
#function for histogram for customer attrition types
def histogram(column):
    """Plot a percent-normalized histogram of df[column] (re-declared after scaling)."""
    hist = go.Histogram(x=df[column],
                        histnorm="percent",
                        name="US Customers 2018",
                        marker=dict(line=dict(width=.5, color="black")),
                        opacity=.9)
    x_axis = dict(gridcolor='rgb(80, 80, 80)', title=column,
                  zerolinewidth=1, ticklen=5, gridwidth=2)
    y_axis = dict(gridcolor='rgb(80, 80, 80)', title="percent",
                  zerolinewidth=1, ticklen=5, gridwidth=2)
    layout = go.Layout(dict(title=column + " distribution of US Customers",
                            plot_bgcolor="rgb(255,255,255)",
                            paper_bgcolor="rgb(255,255,255)",
                            xaxis=x_axis,
                            yaxis=y_axis))
    iplot(go.Figure(data=[hist], layout=layout))
# Re-plot histograms for the normalized columns only.
for col in cols_to_norm:
    histogram(col)
#list(set(num_cols) - set(cols_to_norm))
#scatter plots
# Scatter of tenure against (normalized) free units.
trace1 = go.Scatter(x=df.tenure,
                    y=df.free_units,
                    mode="markers",
                    name="tenure and free units",
                    marker=dict(color='rgba(255, 128, 255, 0.8)'))
data = [trace1]
layout = dict(title='tenure vs free units (normalized)',
              xaxis=dict(title='Tenure', ticklen=5, zeroline=False),
              yaxis=dict(title='free units', ticklen=5, zeroline=False))
fig = dict(data=data, layout=layout)
iplot(fig)
#scatter plots
# Scatter of tenure against (normalized) paid units.
trace1 = go.Scatter(x=df.tenure,
                    y=df.paid_units,
                    mode="markers",
                    name="tenure and paid units",
                    marker=dict(color='rgba(133, 239, 172, 0.9)'))
data = [trace1]
layout = dict(title='tenure vs paid units (normalized)',
              xaxis=dict(title='Tenure', ticklen=5, zeroline=False),
              yaxis=dict(title='paid units', ticklen=5, zeroline=False))
fig = dict(data=data, layout=layout)
iplot(fig)
#scatter plots
# Scatter of tenure against (normalized) number of streaming sessions.
trace1 = go.Scatter(
    x = df.tenure,
    y = df.num_streaming_session_2018,
    mode = "markers",
    # fixed copy-paste label: this trace plots streaming sessions,
    # not "reading sessions units"
    name = "tenure and # of streaming sessions",
    marker = dict(color = 'rgba(133, 239, 172, 0.9)'),
)
data = [trace1]
layout = dict(title = 'tenure vs # of streaming sessions (normalized)',
              xaxis = dict(title = 'Tenure', ticklen = 5, zeroline = False),
              yaxis = dict(title = '# of streaming sessions', ticklen = 5, zeroline = False))
fig = dict(data = data, layout = layout)
iplot(fig)
# Distinct levels of count_streaming_month, ascending (x-axis of the box plots).
csm = sorted(df["count_streaming_month"].unique())
#1. iterate through each level of discrete variable and create trace, and plot them on the same window
def boxplot(target, disc_var_name, disc_var):
    """Draw one box per level of a discrete variable, all on one figure.

    Parameters
    ----------
    target : str
        Numeric column of the global `df` to summarize (y-axis).
    disc_var_name : str
        Discrete column of `df` whose levels form the x-axis.
    disc_var : list
        The levels to plot, in display order (e.g. `csm`).
    """
    # Accumulate every trace in a local list instead of injecting
    # `trace<level>` names via globals(). The original also appeared to
    # re-initialize `data` each iteration, which would keep only the last
    # level; collecting all boxes here matches the stated intent
    # ("plot them on the same window").
    data = [
        go.Box(y=df[df[disc_var_name] == level][target],
               name=str(level),
               marker=dict(color='rgb(12, 128, 128)'))
        for level in disc_var
    ]
    layout = go.Layout(title=str(target) + " " + str(disc_var_name),
                       xaxis=go.layout.XAxis(title=str(disc_var_name)),
                       yaxis=go.layout.YAxis(title=str(target)))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
# Box plots of each normalized metric by months-with-streaming level.
for target_col in ("num_streaming_session_2018", "free_units", "paid_units"):
    boxplot(target_col, "count_streaming_month", csm)
#Scatter plot matrix using seaborn
sns.set(style="ticks")
sns.pairplot(df)
#scatter plots
# Scatter of (normalized) paid units against number of streaming sessions.
trace1 = go.Scatter(
    x = df.paid_units,
    y = df.num_streaming_session_2018,
    mode = "markers",
    name = "paid units and number of streaming sessions",
    marker = dict(color = 'rgba(133, 239, 172, 0.9)'),
)
data = [trace1]
# fixed copy-paste title: the axes are paid units vs streaming sessions, not tenure
layout = dict(title = 'paid units vs number of streaming sessions (normalized)',
              xaxis = dict(title = 'paid units', ticklen = 5, zeroline = False),
              yaxis = dict(title = 'number of streaming sessions', ticklen = 5, zeroline = False))
fig = dict(data = data, layout = layout)
iplot(fig)
# clustering dataset
# determine k using elbow method
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

# Feature matrix: (paid_units, num_streaming_session_2018), already min-max scaled above.
x1 = np.array(df["paid_units"])
x2 = np.array(df["num_streaming_session_2018"])
X = np.column_stack((x1, x2))  # same as zip+reshape, without the Python-level loop

# k means determine k: distortion = mean distance of each point to its nearest centroid.
distortions = []
K = range(1, 10)
for k in K:
    # fit once per k (the original called .fit(X) a second time, doubling the work)
    kmeanModel = KMeans(n_clusters=k).fit(X)
    distortions.append(
        sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

# Plot the elbow
plt.rcParams["figure.figsize"] = [16, 9]
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
#k = 5 clusters, fixed seed so the assignments are reproducible
kmeans = KMeans(n_clusters=5, random_state=101)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

# Color points by cluster label and overlay the centroids in black.
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='plasma')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=1.0)

#Clustering Result: share of customers per cluster
cluster_share = pd.DataFrame(y_kmeans)[0].value_counts(normalize=True)
iid = cluster_share.index
vid = pd.Series(cluster_share.values * 100)
for i, j in zip(iid, vid):
    print("Cluster " + str(i) + " comprised of " + str(round(j, 1)) + "%")
#merge back the clustering result to the non-normalized vectors
df_cl = df_bu.copy()
df_cl["kmeans_cluster_label"] = y_kmeans

pd.options.display.max_columns = None
grouped = df_cl.groupby(by=["kmeans_cluster_label"])
display(round(grouped.mean(), 3))
pd.options.display.max_columns = None
display(round(grouped.describe(), 3))
import time
import warnings
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)  # reproducible stochastic initializations below
plot_num = 1       # running subplot index for the comparison figure

# Shared default hyper-parameters for the clustering algorithms compared below.
default_base = {'quantile': .3,            # MeanShift bandwidth quantile
                'eps': .3,                 # DBSCAN neighborhood radius
                'damping': .9,             # AffinityPropagation damping
                'preference': -200,        # AffinityPropagation preference
                'n_neighbors': 10,         # kneighbors_graph connectivity
                'n_clusters': 2,           # k for algorithms that need it
                'min_samples': 10,         # OPTICS (currently commented out)
                'xi': 0.05,                # OPTICS
                'min_cluster_size': 0.01}  # OPTICS
# Compare several clustering algorithms on the (paid units, streaming sessions)
# matrix and plot their labelings side by side (adapted from the scikit-learn
# clustering-comparison example).
# Renamed from `datasets`: that name shadowed the sklearn `datasets` import above.
cluster_datasets = [(X, default_base)]
for i_dataset, (dataset, algo_params) in enumerate(cluster_datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X_ = dataset
    # estimate bandwidth for mean shift — use the current dataset X_,
    # not the global X (identical here, but wrong once more datasets are added)
    bandwidth = cluster.estimate_bandwidth(X_, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X_, n_neighbors=params['n_neighbors'], include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(
        n_clusters=params['n_clusters'], linkage='ward',
        connectivity=connectivity)
    spectral = cluster.SpectralClustering(
        n_clusters=params['n_clusters'], eigen_solver='arpack',
        affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    # optics = cluster.OPTICS(min_samples=params['min_samples'],
    #                         xi=params['xi'],
    #                         min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    # NOTE(review): the `affinity=` kwarg was removed from
    # AgglomerativeClustering in newer scikit-learn (use `metric=`) — confirm
    # the pinned sklearn version before upgrading.
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(
        n_components=params['n_clusters'], covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan),
        # ('OPTICS', optics),
        ('Birch', birch),
        ('GaussianMixture', gmm)
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()
        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                        "connectivity matrix is [0-9]{1,2}" +
                        " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                        " may not work as expected.",
                category=UserWarning)
            algorithm.fit(X_)
        t1 = time.time()

        if hasattr(algorithm, 'labels_'):
            # np.int was removed in NumPy >= 1.24; the builtin int is equivalent here
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X_)

        plt.subplot(len(cluster_datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        # One color per cluster label, cycling if there are more labels than colors.
        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                             '#f781bf', '#a65628', '#984ea3',
                                             '#999999', '#e41a1c', '#dede00']),
                                      int(max(y_pred) + 1))))
        # add black color for outliers (if any)
        colors = np.append(colors, ["#000000"])
        plt.scatter(X_[:, 0], X_[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=10,
                 horizontalalignment='right')
        plot_num += 1
plt.show()